import requests
from lxml import etree
from time import sleep
import csv
import numpy as np

# --- Scraper configuration and output setup ---------------------------------
url = 'https://movie.douban.com/top250'
# Desktop-browser User-Agent: Douban rejects requests with the default
# python-requests UA, so a realistic header is required.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'}

# Per-field accumulators, one entry per movie, filled during the crawl.
title = []      # movie titles
links = []      # detail-page URLs
director = []   # director / cast info lines
scores = []     # rating values
comment = []    # rating counts
summary = []    # one-line quotes

# newline='' is required by the csv module: without it the writer emits an
# extra blank line between rows on Windows (csv docs, "csv.writer" footnote).
fp = open('./douban_top250.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(fp)
writer.writerow(['电影名','电影详情页链接','导演、演员','评分','评分人数','简介'])

# Crawl all 10 pages (25 movies per page, start = 0, 25, ..., 225).
for i in range(0, 250, 25):
    page_url = f'https://movie.douban.com/top250?start={i}&filter='

    response = requests.get(page_url, headers=headers)
    # Be polite between requests to avoid being rate-limited.
    sleep(1)

    data = etree.HTML(response.text)
    li_list = data.xpath('//*[@id="content"]/div/div[1]/ol/li')

    for each in li_list:
        # xpath() returns a list of text nodes; join/strip to get plain
        # strings ('' when the node is absent) instead of writing raw lists.
        movie_title = ''.join(each.xpath('./div/div[2]/div[1]/a/span[1]/text()')).strip()
        link = ''.join(each.xpath('./div/div[2]/div[1]/a/@href')).strip()
        # p[1] holds several text nodes (director/cast line, year/country
        # line) — collapse them into one whitespace-normalized string.
        info = ' '.join(t.strip() for t in each.xpath('./div/div[2]/div[2]/p[1]/text()') if t.strip())
        score = ''.join(each.xpath('./div/div[2]/div[2]/div/span[2]/text()')).strip()
        num_ratings = ''.join(each.xpath('./div/div[2]/div[2]/div/span[4]/text()')).strip()
        # Fix: the quote lives in p[2]/span — the original reused the
        # rating-count XPath (span[4]) here by copy-paste mistake.
        quote = ''.join(each.xpath('./div/div[2]/div[2]/p[2]/span/text()')).strip()

        # Accumulate into the module-level result lists. (The original
        # shadowed `title`/`comment`/`summary` with its xpath results,
        # appended each list to itself, and appended director data to
        # `links` instead of `director`.)
        title.append(movie_title)
        links.append(link)
        director.append(info)
        scores.append(score)
        comment.append(num_ratings)
        summary.append(quote)

        writer.writerow([movie_title, link, info, score, num_ratings, quote])

    print(f'————————————第{int((i / 25) + 1)}页爬取完毕！——————————————')

# Fix: close the file only after ALL pages are written — the original called
# fp.close() inside the page loop, so every page after the first wrote to a
# closed file.
fp.close()
print("——————————————————————————————————爬虫结束！！！！！————————————————————————————————————————————————")